import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
#from google.colab import drive
#drive.mount('/content/drive')
train_df = pd.read_csv("train_data.csv", sep=",", na_values=[" ","?","#"])
test_df = pd.read_csv("test_data.csv", sep=",", na_values=[" ","?","#"])
discount_df = pd.read_csv("discount_features.csv", sep=",", na_values=[" ","?","#"]
,parse_dates=['date'])
foot_fall_df = pd.read_csv("foot_fall.csv", sep=",", na_values=[" ","?","#"])
print(train_df.shape)
print(test_df.shape)
print(discount_df.shape)
print(foot_fall_df.shape)
train_df.head()
train_df.tail()
The train data is dated from 2nd january 2015 to 30th June 2018
train_df.describe()
len(train_df[train_df["sales"]<0])
1672 rows have negative sales
test_df.head()
test_df.tail()
The test data ranges from 1st July 2018 to 31st July 2018
discount_df.head()
discount_df.tail()
We have discount data from 1 April 2016 to 31 July 2018
foot_fall_df.head(10)
train_df["sales"].abs().min()
## Checking for NA values
train_df.isna().sum()
test_df.isna().sum()
foot_fall_df.isna().sum()
discount_df.isna().sum()
Initially there are no na values in the data
discount_df.dtypes
## Creating Copies of the datasets
train_df_copy = train_df.copy()
test_df_copy = test_df.copy()
discount_df_copy = discount_df.copy()
foot_fall_df_copy = foot_fall_df.copy()
## dismantling the date column in discount_features_copy into year, month and day
discount_df_copy.dtypes
discount_df_copy['year'] = discount_df_copy['date'].dt.year
discount_df_copy['month'] = discount_df_copy['date'].dt.month
discount_df_copy['day'] = discount_df_copy['date'].dt.day
discount_df_copy.head()
train_df_discounted_with_date = pd.merge(train_df_copy, discount_df_copy
,how="inner", on=["year","month","day","city","medicine"])
train_df_discounted_with_date.head()
## Dropping the date column in discount_features_copy
discount_df_copy.drop("date", axis = 1, inplace=True)
discount_df_copy.head()
## Merging the train_data_copy and the discounted_data_copy for adding discount feature
train_df_discounted = pd.merge(train_df_copy, discount_df_copy
,how="left", on=["year","month","day","city","medicine"])
print(train_df_copy.shape)
print(train_df_discounted.shape)
##train_discounted.drop_duplicates(inplace=True)
print(train_df_copy.shape)
print(train_df_discounted.shape)
train_df_discounted.head()
train_df_discounted.tail()
train_df_discounted.isna().sum()
train_df_discounted.fillna(0,inplace=True)
train_df_discounted.head()
## Dropping id column from the test_data_copy()
test_df_copy.drop("id", axis=1, inplace=True)
test_df_copy.head()
## Merging and test and discounted
test_df_discounted = pd.merge(test_df_copy, discount_df_copy, how="inner"
,on=["year","month","day","city","medicine"])
test_df_discounted.head()
print(test_df_copy.shape)
print(test_df_discounted.shape)
print(train_df_discounted.medicine.nunique())
print(test_df_discounted.medicine.nunique())
train_neg_df_discounted = train_df_discounted[train_df_discounted["sales"]<0]
train_neg_df_discounted.shape
train_neg_df_discounted.head()
train_neg_df_discounted.isna().sum()
train_neg_df_discounted.medicine.nunique()
train_df_discounted.drop(train_df_discounted[train_df_discounted["sales"]<0].index,axis=0,inplace=True)
train_df_discounted.shape
train_df_discounted.dtypes
cat_columns = ["year","month","day","city","medicine","discounted"]
def change_to_category(df,col_list):
for col in col_list:
df[col] = df[col].astype("category")
return df
train_discounted = change_to_category(train_df_discounted,cat_columns)
train_df_discounted.dtypes
## Creating City wise datasets
city_frame = {}
for city in train_df_discounted.city.unique():
df = train_df_discounted[train_df_discounted["city"]==city]
city_frame[city] = df
kolkata_df = city_frame[1]
other_df = city_frame[2]
jaipur_df = city_frame[3]
mumbai_df = city_frame[4]
delhi_df = city_frame[5]
chennai_df = city_frame[6]
bengaluru_df = city_frame[7]
hyderabad_df = city_frame[8]
ahemdabad_df = city_frame[9]
lucknow_df = city_frame[10]
print(kolkata_df.shape)
print(other_df.shape)
print(jaipur_df.shape)
print(mumbai_df.shape)
print(delhi_df.shape)
print(chennai_df.shape)
print(bengaluru_df.shape)
print(hyderabad_df.shape)
print(ahemdabad_df.shape)
print(lucknow_df.shape)
## Yearly analysis
yearly_sales_df = train_df_discounted.groupby(["year"]).agg({"sales":sum}).reset_index()
yearly_sales_df = yearly_sales_df.sort_values(by="sales",ascending=False)
yearly_sales_df
ax = sns.barplot(x="sales", y="year", data=yearly_sales_df)
1) Overall 2017 had most no of sales 2) Overall 2018 had least no of sales
## Monthly analysis
monthly_sales_df = train_df_discounted.groupby(["month"]).agg({"sales":sum}).reset_index()
monthly_sales_df = monthly_sales_df.sort_values(by="sales", ascending=False)
monthly_sales_df
ax = sns.barplot(x="month", y="sales", data = monthly_sales_df)
March has the highest sales followed by January and December. August has the lowest sales
train_df_discounted_copy_city = train_df_discounted.copy()
city = {4: 'Mumbai', 5: 'Delhi', 7: 'Bangalore', 1: 'Kolkata',
6: 'Chennai', 8: 'Hyderabad', 9: 'Ahmedabad', 3: 'Jaipur', 10: 'Lucknow', 2: 'Other'}
train_df_discounted_copy_city["city"]=train_df_discounted_copy_city["city"].map(city)
train_df_discounted_copy_city.head()
import matplotlib.pyplot as plt
import seaborn as sns
def do_city_analysis(df, city):
city_df = df[df["city"] == city]
city_df["year"] = city_df["year"].astype("category")
city_df["month"] = city_df["month"].astype("category")
city_df["day"] = city_df["day"].astype("category")
fig_dims = (20, 20)
f, axes = plt.subplots(nrows=3, ncols=1, figsize = fig_dims)
yearly_sales_df = city_df.groupby(["year"]).agg({"sales":sum}).reset_index()
yearly_sales_df = yearly_sales_df.sort_values(by="sales", ascending=False)
sns.barplot(x="sales", y="year", data=yearly_sales_df, ax=axes[0])
monthy_sales_df = city_df.groupby(["month"]).agg({"sales":sum}).reset_index()
monthy_sales_df = monthy_sales_df.sort_values(by="sales", ascending=False)
sns.barplot(x="sales", y="month", data=monthy_sales_df, ax=axes[1])
daily_sales_df = city_df.groupby(['day']).agg({"sales":sum}).reset_index()
daily_sales_df = daily_sales_df.sort_values(by="sales", ascending = False)
sns.barplot(x="sales", y="day", data=daily_sales_df, ax=axes[2])
medicine_df = city_df.groupby(["medicine"]).agg({"sales":sum}).reset_index()
medicine_df = medicine_df.sort_values(by="sales", ascending=False)
zero_sales_meds = medicine_df[medicine_df["sales"]==0]["medicine"]
print(f"total # of medicenes with zero sale: {len(zero_sales_meds)}")
plt.show()
import matplotlib.pyplot as plt
def plot_top_meds(df, city, year, top=10):
df[(df["city"] == city) & (df["year"] == year)].groupby(["medicine"])["sales"].sum().sort_values(ascending = False)[:10].plot.bar()
plt.xticks(rotation = 45, ha = "right")
plt.title( f"top {top} medicines sold in {city} in {year}")
plt.ylabel("sales")
plt.show()
do_city_analysis(train_df_discounted_copy_city,"Kolkata")
# Top 10 medicines in 2015 in Kolkata
plot_top_meds(train_df_discounted_copy_city, "Kolkata",2015)
# Top 10 medicines in 2016 in Kolkata
plot_top_meds(train_df_discounted_copy_city, "Kolkata",2016)
# Top 10 medicines in 2017 in Kolkata
plot_top_meds(train_df_discounted_copy_city, "Kolkata",2017)
# Top 10 medicines in 2018 in Kolkata
plot_top_meds(train_df_discounted_copy_city, "Kolkata",2018)
kolkata_medicine_df = kolkata_df.groupby(["medicine"]).agg({"sales":sum}).reset_index()
kolkata_medicine_df = kolkata_medicine_df.sort_values(by="sales", ascending=False)
kolkata_zero_sales_meds = kolkata_medicine_df[kolkata_medicine_df["sales"]==0]["medicine"]
len(kolkata_zero_sales_meds)
kolkata_1077 = kolkata_df[kolkata_df["medicine"]==1077]
kolkata_1077_discount = kolkata_1077.groupby(["discounted"]).agg({"sales":sum}).reset_index()
kolkata_1077_discount
discount had no role to play in the sales of 1077
do_city_analysis(train_df_discounted_copy_city, "Jaipur")
# top 10 medicine in Jaipur in 2015
plot_top_meds(train_df_discounted_copy_city, city="Jaipur", year=2015)
# top 10 medicine in Jaipur in 2016
plot_top_meds(train_df_discounted_copy_city, city="Jaipur", year=2016)
# top 10 medicine in Jaipur in 2017
plot_top_meds(train_df_discounted_copy_city, city="Jaipur", year=2017)
# top 10 medicine in Jaipur in 2018
plot_top_meds(train_df_discounted_copy_city, city="Jaipur", year=2018)
jaipur_medicine_df = jaipur_df.groupby(["medicine"]).agg({"sales":sum}).reset_index()
jaipur_medicine_df = jaipur_medicine_df.sort_values(by="sales", ascending=False)
jaipur_zero_sales_meds = jaipur_medicine_df[jaipur_medicine_df["sales"]==0]
jaipur_zero_sales_meds = jaipur_zero_sales_meds["medicine"]
len(jaipur_zero_sales_meds)
jaipur_1077 = jaipur_df[jaipur_df["medicine"]==1077]
jaipur_1077_discount = jaipur_1077.groupby(["discounted"]).agg({"sales":sum}).reset_index()
jaipur_1077_discount
jaipur_2103 = jaipur_df[jaipur_df["medicine"]==2103]
jaipur_2103_discount = jaipur_2103.groupby(["discounted"]).agg({"sales":sum}).reset_index()
jaipur_2103_discount
discount had no role to play in the sales of 1077 in jaipur discount had a role to play in the sales of 2103 in jaipur
do_city_analysis(train_df_discounted_copy_city, "Mumbai")
plot_top_meds(train_df_discounted_copy_city, city="Mumbai", year=2015)
plot_top_meds(train_df_discounted_copy_city, city="Mumbai", year=2016)
plot_top_meds(train_df_discounted_copy_city, city="Mumbai", year=2017)
plot_top_meds(train_df_discounted_copy_city, city="Mumbai", year=2018)
mumbai_medicine_df = mumbai_df.groupby(["medicine"]).agg({"sales":sum}).reset_index()
mumbai_medicine_df = mumbai_medicine_df.sort_values(by="sales", ascending=False)
mumbai_zero_sales_meds = mumbai_medicine_df[mumbai_medicine_df["sales"]==0]
mumbai_zero_sales_meds = mumbai_zero_sales_meds["medicine"]
len(mumbai_zero_sales_meds)
mumbai_1077 = mumbai_df[mumbai_df["medicine"]==1077]
mumbai_1077_discount = mumbai_1077.groupby(["discounted"]).agg({"sales":sum}).reset_index()
mumbai_1077_discount
mumbai_2103 = mumbai_df[mumbai_df["medicine"]==2103]
mumbai_2103_discount = mumbai_2103.groupby(["discounted"]).agg({"sales":sum}).reset_index()
mumbai_2103_discount
do_city_analysis(train_df_discounted_copy_city, "Delhi")
plot_top_meds(train_df_discounted_copy_city, "Delhi", 2015)
plot_top_meds(train_df_discounted_copy_city, "Delhi", 2016)
plot_top_meds(train_df_discounted_copy_city, "Delhi", 2016)
plot_top_meds(train_df_discounted_copy_city, "Delhi", 2017)
plot_top_meds(train_df_discounted_copy_city, "Delhi", 2018)
delhi_medicine_df = delhi_df.groupby(["medicine"]).agg({"sales":sum}).reset_index()
delhi_medicine_df = delhi_medicine_df.sort_values(by="sales",ascending=False)
delhi_zero_sales_meds = delhi_medicine_df[delhi_medicine_df["sales"]==0]
delhi_zero_sales_meds = delhi_zero_sales_meds["medicine"]
len(delhi_zero_sales_meds)
delhi_2103 = delhi_df[delhi_df["medicine"]==2103]
delhi_2103_discount = delhi_2103.groupby(["discounted"]).agg({"sales":sum}).reset_index()
delhi_2103_discount
delhi_574 = delhi_df[delhi_df["medicine"]==574]
delh_574_discount = delhi_574.groupby(["discounted"]).agg({"sales":sum}).reset_index()
delh_574_discount
discount have affect on sales of medicine 574 and 2103 in delhi
do_city_analysis(train_df_discounted_copy_city, "Chennai")
plot_top_meds(train_df_discounted_copy_city, "Chennai", 2015)
plot_top_meds(train_df_discounted_copy_city, "Chennai", 2016)
plot_top_meds(train_df_discounted_copy_city, "Chennai", 2017)
plot_top_meds(train_df_discounted_copy_city, "Chennai", 2018)
chennai_medicine_df = chennai_df.groupby(["medicine"]).agg({"sales":sum}).reset_index()
chennai_medicine_df = chennai_medicine_df.sort_values(by="sales", ascending=False)
chennai_zero_sales_meds = chennai_medicine_df[chennai_medicine_df["sales"]==0]
chennai_zero_sales_meds = chennai_zero_sales_meds["medicine"]
len(chennai_zero_sales_meds)
chennai_574 = chennai_df[chennai_df["medicine"]==574]
chennai_574_discount = chennai_574.groupby(["discounted"]).agg({"sales":sum}).reset_index()
chennai_574_discount
chennai_1517 = chennai_df[chennai_df["medicine"]==1517]
chennai_1517_discount = chennai_1517.groupby(["discounted"]).agg({"sales":sum}).reset_index()
chennai_1517_discount
chennai_1971 = chennai_df[chennai_df["medicine"]==1971]
chennai_1971_discount = chennai_1971.groupby(["discounted"]).agg({"sales":sum}).reset_index()
chennai_1971_discount
do_city_analysis(train_df_discounted_copy_city, "Bangalore")
plot_top_meds(train_df_discounted_copy_city, "Bangalore", 2015)
plot_top_meds(train_df_discounted_copy_city, "Bangalore", 2016)
plot_top_meds(train_df_discounted_copy_city, "Bangalore", 2017)
plot_top_meds(train_df_discounted_copy_city, "Bangalore", 2018)
bengaluru_medicine_df = bengaluru_df.groupby(["medicine"]).agg({"sales":sum}).reset_index()
bengaluru_medicine_df = bengaluru_medicine_df.sort_values(by="sales", ascending= False)
bengaluru_zero_sales_meds = bengaluru_medicine_df[bengaluru_medicine_df["sales"]==0]
bengaluru_zero_sales_meds =bengaluru_zero_sales_meds["medicine"]
len(bengaluru_zero_sales_meds)
bengaluru_medicine_df.head()
bengaluru_768 = bengaluru_df[bengaluru_df["medicine"]==768]
bengaluru_768_discount = bengaluru_768.groupby(["discounted"]).agg({"sales":sum}).reset_index()
bengaluru_768_discount
bengaluru_1517 = bengaluru_df[bengaluru_df["medicine"]==1517]
bengaluru_1517_discount = bengaluru_1517.groupby(["discounted"]).agg({"sales":sum}).reset_index()
bengaluru_1517_discount
bengaluru_1077 = bengaluru_df[bengaluru_df["medicine"]==1077]
bengaluru_1077_discount = bengaluru_1077.groupby(["discounted"]).agg({"sales":sum}).reset_index()
bengaluru_1077_discount
bengaluru_2103 = bengaluru_df[bengaluru_df["medicine"]==2103]
bengaluru_2103_discount = bengaluru_2103.groupby(["discounted"]).agg({"sales":sum}).reset_index()
bengaluru_2103_discount
do_city_analysis(train_df_discounted_copy_city, "Hyderabad")
plot_top_meds(train_df_discounted_copy_city, "Hyderabad", 2015)
plot_top_meds(train_df_discounted_copy_city, "Hyderabad", 2016)
plot_top_meds(train_df_discounted_copy_city, "Hyderabad", 2017)
plot_top_meds(train_df_discounted_copy_city, "Hyderabad", 2018)
hyderabad_medicine_df = hyderabad_df.groupby(["medicine"]).agg({"sales":sum}).reset_index()
hyderabad_medicine_df = hyderabad_medicine_df.sort_values(by="sales", ascending= False)
hyderabad_zero_sales_meds = hyderabad_medicine_df[hyderabad_medicine_df["sales"]==0]
hyderabad_zero_sales_meds = hyderabad_zero_sales_meds["medicine"]
len(hyderabad_zero_sales_meds)
hyderabad_medicine_df.head()
hyderabad_574 = hyderabad_df[hyderabad_df["medicine"]==574]
hyderabad_574_discount = hyderabad_574.groupby(["discounted"]).agg({"sales":sum}).reset_index()
hyderabad_574_discount
hyderabad_1971 = hyderabad_df[hyderabad_df["medicine"]==1971]
hyderabad_1971_discount = hyderabad_1971.groupby(["discounted"]).agg({"sales":sum}).reset_index()
hyderabad_1971_discount
do_city_analysis(train_df_discounted_copy_city, "Ahmedabad")
plot_top_meds(train_df_discounted_copy_city, "Ahmedabad", 2015)
plot_top_meds(train_df_discounted_copy_city, "Ahmedabad", 2016)
plot_top_meds(train_df_discounted_copy_city, "Ahmedabad", 2017)
plot_top_meds(train_df_discounted_copy_city, "Ahmedabad", 2018)
ahemdabad_medicine_df = ahemdabad_df.groupby(["medicine"]).agg({"sales":sum}).reset_index()
ahemdabad_medicine_df = ahemdabad_medicine_df.sort_values(by="sales",ascending= False)
ahemdabad_zero_sales_meds = ahemdabad_medicine_df[ahemdabad_medicine_df["sales"]==0]
ahemdabad_zero_sales_meds = ahemdabad_zero_sales_meds["medicine"]
len(ahemdabad_zero_sales_meds)
ahemdabad_medicine_df.head()
ahemdabad_316 = ahemdabad_df[ahemdabad_df["medicine"]==316]
ahemdabad_316_discount = ahemdabad_316.groupby(["discounted"]).agg({"sales":sum}).reset_index()
ahemdabad_316_discount
ahemdabad_2103 = ahemdabad_df[ahemdabad_df["medicine"]==2103]
ahemdabad_2103_discount = ahemdabad_2103.groupby(["discounted"]).agg({"sales":sum}).reset_index()
ahemdabad_2103_discount
do_city_analysis(train_df_discounted_copy_city, "Lucknow")
plot_top_meds(train_df_discounted_copy_city, "Lucknow", 2015)
plot_top_meds(train_df_discounted_copy_city, "Lucknow", 2016)
plot_top_meds(train_df_discounted_copy_city, "Lucknow", 2017)
plot_top_meds(train_df_discounted_copy_city, "Lucknow", 2018)
lucknow_medicine_df = lucknow_df.groupby("medicine").agg({"sales":sum}).reset_index()
lucknow_medicine_df = lucknow_medicine_df.sort_values(by="sales", ascending = False)
lucknow_zero_sales_meds = lucknow_medicine_df[lucknow_medicine_df["sales"]==0]
lucknow_zero_sales_meds = lucknow_zero_sales_meds["medicine"]
len(lucknow_zero_sales_meds)
lucknow_medicine_df.head()
lucknow_574 = lucknow_df[lucknow_df["medicine"]==574]
lucknow_574_discount = lucknow_574.groupby(["discounted"]).agg({"sales":sum}).reset_index()
lucknow_574_discount
lucknow_2103 = lucknow_df[lucknow_df["medicine"]==2103]
lucknow_2103_discount = lucknow_2103.groupby(["discounted"]).agg({"sales":sum}).reset_index()
lucknow_2103_discount
discount has affect on the sales of medicine 574 and 2103 in lucknow
do_city_analysis(train_df_discounted_copy_city, "Other")
For Other cities day 3 has most number of sales followed by day 2, 6 and 1. Day 31 has least number of sale
plot_top_meds(train_df_discounted_copy_city,"Other", 2015)
plot_top_meds(train_df_discounted_copy_city,"Other", 2016)
plot_top_meds(train_df_discounted_copy_city,"Other", 2017)
plot_top_meds(train_df_discounted_copy_city,"Other", 2018)
other_medicine_df = other_df.groupby("medicine").agg({"sales":sum}).reset_index()
other_medicine_df = other_medicine_df.sort_values(by="sales", ascending = False)
other_zero_sales_meds = other_medicine_df[other_medicine_df["sales"]==0]
other_zero_sales_meds = other_zero_sales_meds["medicine"]
len(other_zero_sales_meds)
other_medicine_df.head()
city_total_df = train_df_discounted.groupby(["city"]).agg({"sales":sum}).reset_index()
city_total_df = city_total_df.sort_values(by="sales", ascending=False)
city_total_df
ax = sns.barplot(x="sales", y="city", data=city_total_df)
# Mumbai has highest sales
# Other cities have least sales
Mumbai has most sales followed by Delhi and Bengaluru
overall_medicine_df = train_df_discounted.groupby("medicine").agg({"sales":sum}).reset_index()
overall_medicine_df = overall_medicine_df.sort_values(by="sales", ascending = False)
overall_medicine_df.head()
overall_medicine_df.tail()
Overall medicine 1077 has higest sales followed by 2103 and 574. Also medicine 2424 has least sales
As we know that overall March has the highest sales followed by January, December and August has the lowest sales. Let's see what happend in these months
jan_df = train_df_discounted[train_df_discounted["month"]==1]
march_df = train_df_discounted[train_df_discounted["month"]==3]
dec_df = train_df_discounted[train_df_discounted["month"]==12]
aug_df = train_df_discounted[train_df_discounted["month"]==8]
print(jan_df.shape)
print(march_df.shape)
print(dec_df.shape)
print(aug_df.shape)
jan_df.head()
jan_df.tail()
jan_daily_sales_df = jan_df.groupby(["day"]).agg({"sales":sum}).reset_index()
jan_daily_sales_df = jan_daily_sales_df.sort_values(by="sales", ascending=False)
jan_daily_sales_df.head()
jan_daily_sales_df.tail()
ax = sns.barplot(x="day", y="sales", data=jan_daily_sales_df)
There are no sales on Day 1. Day 2 had highest sales followed by day 4, 3 and 5. We observe seasonality across the whole month of march
jan_medicine_df = jan_df.groupby("medicine").agg({"sales":sum}).reset_index()
jan_medicine_df = jan_medicine_df.sort_values(by="sales", ascending = False)
jan_medicine_df.head()
jan_medicine_df.tail()
jan_zero_sales_meds = jan_medicine_df[jan_medicine_df["sales"]==0]
jan_zero_sales_meds = jan_zero_sales_meds["medicine"]
len(jan_zero_sales_meds)
In January medicine 1077 has most sale followed by 2103 Also there are 230 medcine that have zero sales in Jan accross, years and cities
march_daily_sales_df = march_df.groupby(["day"]).agg({"sales":sum}).reset_index()
march_daily_sales_df = march_daily_sales_df.sort_values(by="sales", ascending=False)
march_daily_sales_df.head()
march_daily_sales_df.tail()
ax = sns.barplot(x="day", y="sales", data=march_daily_sales_df)
For March Day 1 has most sale followed by day 16, 9 , 2. Day 25 has least sale. We see a spike in sales every week
march_medicine_df = march_df.groupby("medicine").agg({"sales":sum}).reset_index()
march_medicine_df = march_medicine_df.sort_values(by="sales", ascending = False)
march_medicine_df.head()
march_medicine_df.tail()
march_zero_sales_meds = march_medicine_df[march_medicine_df["sales"]==0]
march_zero_sales_meds = march_zero_sales_meds["medicine"]
len(march_zero_sales_meds)
In march medicine 2103 has most sale followed by 1077 Also there are 173 medcine that have zero sales in march accross, years and cities
dec_daily_sales_df = dec_df.groupby(["day"]).agg({"sales":sum}).reset_index()
dec_daily_sales_df = dec_daily_sales_df.sort_values(by="sales", ascending=False)
dec_daily_sales_df.head()
dec_daily_sales_df.tail()
ax = sns.barplot(x="day", y="sales", data=dec_daily_sales_df)
There are no sale on Christmas day. Day 21 has most sales followed by day 23 and 22. We also see some seasonality in December
dec_medicine_df = dec_df.groupby("medicine").agg({"sales":sum}).reset_index()
dec_medicine_df = dec_medicine_df.sort_values(by="sales", ascending = False)
dec_medicine_df.head()
dec_medicine_df.tail()
dec_zero_sales_meds = dec_medicine_df[dec_medicine_df["sales"]==0]
dec_zero_sales_meds = dec_zero_sales_meds["medicine"]
len(dec_zero_sales_meds)
In december medicine 412 has most sale followed by 2103 and 1077 Also there are 265 medcine that have zero sales in Dec accross, years and cities
aug_daily_sales_df = aug_df.groupby(["day"]).agg({"sales":sum}).reset_index()
aug_daily_sales_df = aug_daily_sales_df.sort_values(by="sales", ascending=False)
aug_daily_sales_df.head()
aug_daily_sales_df.tail()
ax = sns.barplot(x="day", y="sales", data=aug_daily_sales_df)
Day 31 has most sales followed by day 2 and 30. Day 27 has least sale. We also see seasonality in august
aug_medicine_df = aug_df.groupby("medicine").agg({"sales":sum}).reset_index()
aug_medicine_df = aug_medicine_df.sort_values(by="sales", ascending = False)
aug_medicine_df.head()
aug_medicine_df.tail()
aug_zero_sales_meds = aug_medicine_df[aug_medicine_df["sales"]==0]
aug_zero_sales_meds = aug_zero_sales_meds["medicine"]
len(aug_zero_sales_meds)
In aug medicine 1077 has most sale followed by 574 and 2103 Also there are 589 medcine that have zero sales in aug accross, years and cities
In the above graphs we have seen that there is huge difference in sales in 2015 and 2016. One thing that happened in 2016 is that discounts were introduced. So, Lets deep dive
df_2015 = train_df_discounted[train_df_discounted["year"]==2015]
df_2016 = train_df_discounted[train_df_discounted["year"]==2016]
print(df_2015.shape)
print(df_2016.shape)
monthy_sales_df_2015 = df_2015.groupby(["month"]).agg({"sales":sum}).reset_index()
monthy_sales_df_2015 = monthy_sales_df_2015.sort_values(by="sales", ascending=False)
monthy_sales_df_2015
ax = sns.barplot(x="month", y="sales", data=monthy_sales_df_2015)
In 2015 december has the highest sales followed by november and march and feb has the least sales. However, looking at the graph it feels that the sale never realy took off until december
monthy_sales_df_2016 = df_2016.groupby(["month"]).agg({"sales":sum}).reset_index()
monthy_sales_df_2016 = monthy_sales_df_2016.sort_values(by="sales", ascending=False)
monthy_sales_df_2016
ax = sns.barplot(x="month", y="sales", data=monthy_sales_df_2016)
Both in 2015 and 2016 highest sales in December. But, the sale has almost doubled
medicine_df_2015 = df_2015.groupby("medicine").agg({"sales":sum}).reset_index()
medicine_df_2015 = medicine_df_2015.sort_values(by="sales", ascending = False)
medicine_df_2015.head()
medicine_df_2015.tail()
zero_sales_meds_2015 = medicine_df_2015[medicine_df_2015["sales"]==0]
zero_sales_meds_2015 = zero_sales_meds_2015["medicine"]
len(zero_sales_meds_2015)
In 2015 medicine 1077 has most sale followed by 574 and 768 Also there are 1589 medcines that have zero sales in 2015. This implies almost half of the medicine went unsold in 2015
medicine_df_2016 = df_2016.groupby("medicine").agg({"sales":sum}).reset_index()
medicine_df_2016 = medicine_df_2016.sort_values(by="sales", ascending = False)
medicine_df_2016.head()
medicine_df_2016.tail()
zero_sales_meds_2016 = medicine_df_2016[medicine_df_2016["sales"]==0]
zero_sales_meds_2016 = zero_sales_meds_2016["medicine"]
len(zero_sales_meds_2016)
In 2016 medicine 2103 has most sale followed by 1077 and 2051 Also there are 774 medcines that have zero sales in 2016. This implies number of zero sales medicines reduced by 51%
train_df_discounted_copy = train_df_discounted.copy()
train_df_discounted_copy.city = train_df_discounted_copy.city.astype("int")
train_df_discounted_copy.dtypes
def convert_type_to_int(df, col_list):
for col in col_list:
df[col] = df[col].astype("int")
return df
train_df_discounted_copy = convert_type_to_int(df=train_df_discounted_copy, col_list=cat_columns)
train_df_discounted_copy.dtypes
city = {4: 'Mumbai', 5: 'Delhi', 7: 'Bangalore', 1: 'Kolkata',
6: 'Chennai', 8: 'Hyderabad', 9: 'Ahmedabad', 3: 'Jaipur', 10: 'Lucknow', 2: 'Other'}
train_df_discounted_copy["city"]=train_df_discounted_copy["city"].map(city)
import plotly.express as px
def get_category_order_sales_for(df, col, ascending=True):
temp=df.groupby([col])["sales"].sum().sort_values().reset_index()
temp["sales"]=np.round(temp["sales"])
category_orders = list(temp[col])
return category_orders
get_category_order_sales_for(df=train_df_discounted_copy, col="city")
def plot_agg_sales(df, groupby, x, color):
temp=df.groupby(groupby)["sales"].sum().reset_index()
temp["sales"]=np.round(temp["sales"])
title = f"aggregated sales for {x}"
cat_orders = get_category_order_sales_for(df, x)
fig5 = px.bar(temp, x=x, y="sales", barmode='group',
title=title, color = color,
category_orders={x:cat_orders})
fig5.show()
plot_agg_sales(train_df_discounted_copy, groupby = ["year", "city"], x = "city", color = "year")
plot_agg_sales(train_df_discounted_copy, groupby = ["month", "city"], x = "city", color = "month")
train_neg_df_discounted_copy = train_neg_df_discounted.copy()
city_frame = {}
for city in train_neg_df_discounted_copy.city.unique():
df = train_neg_df_discounted_copy[train_neg_df_discounted_copy["city"]==city]
city_frame[city] = df
kolkata_neg_data = city_frame[1]
other_neg_data = city_frame[2]
jaipur_neg_data = city_frame[3]
mumbai_neg_data = city_frame[4]
delhi_neg_data = city_frame[5]
chennai_neg_data = city_frame[6]
bengaluru_neg_data = city_frame[7]
hyderabad_neg_data = city_frame[8]
ahemdabad_neg_data = city_frame[9]
lucknow_neg_data = city_frame[10]
city = {4: 'Mumbai', 5: 'Delhi', 7: 'Bangalore', 1: 'Kolkata',
6: 'Chennai', 8: 'Hyderabad', 9: 'Ahmedabad', 3: 'Jaipur', 10: 'Lucknow', 2: 'Other'}
train_neg_df_discounted_copy["city"]=train_neg_df_discounted_copy["city"].map(city)
train_neg_df_discounted_copy.head()
train_neg_df_discounted_copy = change_to_category(train_neg_df_discounted_copy, cat_columns)
train_neg_df_discounted_copy.dtypes
def do_city_analysis(df, city):
city_df = df[df["city"] == city]
city_df["year"] = city_df["year"].astype("category")
city_df["month"] = city_df["month"].astype("category")
city_df["day"] = city_df["day"].astype("category")
fig_dims = (20, 20)
f, axes = plt.subplots(nrows=3, ncols=1, figsize = fig_dims)
yearly_sales_df = city_df.groupby(["year"]).agg({"sales":sum}).reset_index()
yearly_sales_df = yearly_sales_df.sort_values(by="sales", ascending=False)
sns.barplot(x="sales", y="year", data=yearly_sales_df, ax=axes[0])
monthy_sales_df = city_df.groupby(["month"]).agg({"sales":sum}).reset_index()
monthy_sales_df = monthy_sales_df.sort_values(by="sales", ascending=False)
sns.barplot(x="sales", y="month", data=monthy_sales_df, ax=axes[1])
daily_sales_df = city_df.groupby(['day']).agg({"sales":sum}).reset_index()
daily_sales_df = daily_sales_df.sort_values(by="sales", ascending = False)
sns.barplot(x="sales", y="day", data=daily_sales_df, ax=axes[2])
medicine_df = city_df.groupby(["medicine"]).agg({"sales":sum}).reset_index()
medicine_df = medicine_df.sort_values(by="sales", ascending=False)
zero_sales_meds = medicine_df[medicine_df["sales"]==0]["medicine"]
print(f"total # of medicenes with zero sale: {len(zero_sales_meds)}")
plt.show()
do_city_analysis(train_neg_df_discounted_copy, "Kolkata")
kolkata_medicine_data = kolkata_neg_data.groupby(["medicine"]).agg({"sales":sum}).reset_index()
kolkata_medicine_data = kolkata_medicine_data.sort_values(by="sales", ascending=False)
kolkata_medicine_data.tail()
Medicine 1515 has most number of negative sales
do_city_analysis(train_neg_df_discounted_copy, "Jaipur")
For Jaipur 2018 has the highest negative sales and 2015 has the lowest negative sales For Jaipur April has most negative sales and October has the least negative sales 3.For Jaipur Day 30 has the highest negative sales and Day 26 has the least sales
jaipur_medicine_data = jaipur_neg_data.groupby(["medicine"]).agg({"sales":sum}).reset_index()
jaipur_medicine_data = jaipur_medicine_data.sort_values(by="sales")
jaipur_medicine_data.head()
In Jaipur medicine 2401 has the highest negative sales
do_city_analysis(train_neg_df_discounted_copy, "Mumbai")
For Mumbai 2017 has most negative sales and 2016 has the least sales For Mumbai December has most negative sales followed by january, June and December. August has the least negative sales For Mumbai day 15 has the highest negative sales and Day 27 has the least negative sales
mumbai_medicine_data = mumbai_neg_data.groupby(["medicine"]).agg({"sales":sum}).reset_index()
mumbai_medicine_data = mumbai_medicine_data.sort_values(by="sales")
mumbai_medicine_data.head()
for mumbai medicine 167 has the highest negative sales
do_city_analysis(train_neg_df_discounted_copy, "Delhi")
delhi_medicine_data = delhi_neg_data.groupby(["medicine"]).agg({"sales":sum}).reset_index()
delhi_medicine_data = delhi_medicine_data.sort_values(by="sales")
delhi_medicine_data.head()
For Delhi medicine 792 has highest negative sales
do_city_analysis(train_neg_df_discounted_copy, "Chennai")
For Chennai 2018 has most negative sale and 2015 has least no of sales For Chennai April has the most negative sales and Aug has the least negative sales For Chennai day 24 has most negative sales and day 28 has the least Negative sales
chennai_medicine_data = chennai_neg_data.groupby(["medicine"]).agg({"sales":sum}).reset_index()
chennai_medicine_data = chennai_medicine_data.sort_values(by="sales")
chennai_medicine_data.head()
In Chennai medicine 2472 has the highest negative sale
do_city_analysis(train_neg_df_discounted_copy, "Bangalore")
For Bengaluru 2017 has most negative sales and 2015 has least negative sales For Bengaluru August has most negative sales and October has least no of sales For Bengaluru Day 7 has most negative sales and day 23 has the least sales
bengaluru_medicine_data = bengaluru_neg_data.groupby(["medicine"]).agg({"sales":sum}).reset_index()
bengaluru_medicine_data = bengaluru_medicine_data.sort_values(by="sales")
bengaluru_medicine_data.head()
For Bengaluru medicine 1098 has the highest negative sales
do_city_analysis(train_neg_df_discounted_copy, "Hyderabad")
For Hyderabad 2017 show most negative sales and 2015 has least sales For Hyderabad May has most negative sales July has least negative sales For Hyderabad Day 30 has the most negative sale and day 17, 23, 9 has the least sales
hyderabad_medicine_data = hyderabad_neg_data.groupby(["medicine"]).agg({"sales":sum}).reset_index()
hyderabad_medicine_data = hyderabad_medicine_data.sort_values(by="sales")
hyderabad_medicine_data.head()
For Hyderabad medicine 846 has the highest negative sales
do_city_analysis(train_neg_df_discounted_copy, "Ahmedabad")
ahemdabad_medicine_data = ahemdabad_neg_data.groupby(["medicine"]).agg({"sales":sum}).reset_index()
ahemdabad_medicine_data = ahemdabad_medicine_data.sort_values(by="sales")
ahemdabad_medicine_data.head()
In ahemdabad medicine 1144 has highest negative
do_city_analysis(train_neg_df_discounted_copy, "Lucknow")
In Lucknow medicine 3194 has the highest negative sales
do_city_analysis(train_neg_df_discounted_copy, "Other")
For other cities 2018 has most negative sales and 2015 has the least sales For Other cities April had the most negative. Aug had the least negative sales For Other cities day 23 has most number of negative sales Day 3 has least negative sale
other_medicine_data = other_neg_data.groupby("medicine").agg({"sales":sum}).reset_index()
other_medicine_data = other_medicine_data.sort_values(by="sales")
other_medicine_data.head()
For other cities medicine 1762 has highest negative sales







